################################################################################################################################################################################################################################################
# The Political Consequences of Ethnically Targeted Incarceration: Evidence from Japanese-American Internment During WWII
# Replication Code: Manuscript Summary Statistics and Partial SI
################################################################################################################################################################################################################################################

################################################################################################################################################################################################################################################
# Setup
################################################################################################################################################################################################################################################
# Packages
  require(haven)
  require(data.table)
  require(stargazer)
  require(ggplot2)
  require(tidyverse)

# Functions
  splitit <- function(x,splitchar,n) sapply(strsplit(as.character(x), splitchar), "[", n)
  getmode <- function(v) {
    uniqv <- unique(v)
    uniqv[which.max(tabulate(match(v, uniqv)))]
  }

  as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}
  
################################################################################################################################################################################################################################################
# Directories
################################################################################################################################################################################################################################################
# This sets your working directory to the location of the source file. Note that this command is specific to RStudio. You'll need to change
# this if you're working directly in R or some other IDE (see https://stackoverflow.com/questions/13672720/r-command-for-setting-working-directory-to-source-file-location-in-rstudio)
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
  
################################################################################################################################################################################################################################################
# Load Data
################################################################################################################################################################################################################################################
# JARP Survey
  full <- readRDS("jarpfull_plus.RDS")

# WRA Data on Interned Population  
  wra <- read_dta("internment_raw.dta")
  wra <- data.frame(wra)

# Codebooks
  # WRA State Codes
    wra.state <- read.csv("wra_state.csv", stringsAsFactors = F)

  # WRA Gender and Marital Status Codes  
    wra.gender <- read.csv("wra_gender_marital.csv", stringsAsFactors = F)

  # Pre-Camp Location Codes
    # JARP (called postcamp_location but the location codes here actually apply to everything)
      locations <- read.csv("postcamp_location.csv", stringsAsFactors = F)
      locations$precamp.state.code <- substr(as.character(locations$postcamp.code), 1, 2)
      locations <- locations[ , c("postcamp.state", "precamp.state.code")]
      locations <- unique(locations)
      colnames(locations)[1] <- "precamp.state.name"

  # WRA Assembly Centers  
    wra.assembly <- read.csv("wra_assembly_centers.csv", stringsAsFactors = F)
    colnames(wra.assembly) <- c("assembly", "assembly.center", "comments")
    wra.assembly$assembly.center.label <- c("Fresno", "Manzanar", "Marysville", "Mayer", "Merced", "None", "Pinedale", "Pomona",
                                            "Portland", "Puyallup", "Sacramento", "Salinas", "Santa Anita", "Stockton", "Tanforan",
                                            "Tulare", "Turlock")

################################################################################################################################################################################################################################################
# Where Did People Come From and Where Did They Go?
################################################################################################################################################################################################################################################
# Pre-Camp Locations for 3 Largest States (CA, OR, WA)
  # WRA Data
    # Identify Pre-Internment States in WRA Data
      wra$origin_state <- substr(wra$address, 1, 2)
      wra <- merge(wra, wra.state, by.x = "origin_state", by.y = "state", all.x = T)
      wra$state_name <- splitit(wra$state.desc, " - ", 2)
      wra$state_name[grepl("County", wra$state_name) == 1 | grepl("Hawaii", wra$state_name) == 1] <- "Hawaii"
    
    # Identify Internment Camps in WRA Data
      wracamps <- data.frame(camp.code = c(6, 2, 3, 8, 7, 0, 1, 5, 9, 4),
                             camp.name = c("Topaz", "Poston", "Gila River", "Granada", "Heart Mountain", "Jerome",
                                           "Manzanar", "Minidoka", "Rohwer", "Tule Lake"))
      
      wra <- merge(wra, wracamps, by.x = "camp", by.y = "camp.code", all.x = T)
      
    # Export Plot  
      pdf("ms_fig8b.pdf")
        ggplot(data = wra[wra$state_name %in% c("California", "Washington", "Oregon") & is.na(wra$camp.name) == 0,], aes(x = camp.name)) + 
        geom_bar(col = "cornflowerblue", fill = "cornflowerblue") + facet_wrap(~state_name) + xlab("") + ylab("Internees") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), text = element_text(size = 20), plot.title = element_text(hjust = 0.5))
      dev.off()
      
  # JARP Data
    # Get a uniform variable for precamp location across generations  
      full$i.precamploc <- as.character(full$i.precamploc) ; full$n.precamp <- as.character(full$n.precamp) ; full$s.precamp <- as.character(full$s.precamp)
      full$precamp <- ifelse(full$generation == "Issei", full$i.precamploc,
                             ifelse(full$generation == "Nisei", full$n.precamp,
                                    ifelse(full$generation == "Sansei", full$s.precamp, NA)))
      
    # Identify States using first 2 digits of JARP location code
      full$precamp.state <- substr(full$precamp, 1, 2)
      
      full <- merge(full, locations, by.x = "precamp.state", by.y = "precamp.state.code", all.x = T)  
      
      full$camp <- factor(full$camp, levels = c("No Data", "Not evacuated or interned", "Missoula, MT", "Lordsburg, NM", "Livingston, LA", "Fort Sill, OK","Bismark, ND", "Crystal City, TX", "Santa Fe, NM","Rivers, AZ", "Amache, CO", "Heart Mountain, WY", "Denson, AR","Manzanar, CA", "Hunt, ID", "Poston, AZ","McGehee, AR", "Topaz, UT", "Newell, CA"), ordered = T)
      
      # Define Treatment Status    
      full$anyfam <- ifelse(full$internedinfam > 0, 1, 0)
      
      full$treatmentstatus <- ifelse(full$anyfam == 1 & full$interned.numeric == "Yes", "self.fam",
                                     ifelse(full$anyfam == 0 & full$interned.numeric == "Yes", "self.only",
                                            ifelse(full$anyfam == 1 & full$interned.numeric == "No", "fam.only",
                                                   ifelse(full$anyfam == 0 & full$interned.numeric == "No", "control", NA))))
      
      
    # Export Figure
      pdf("ms_fig8a.pdf")
        ggplot(data = full[full$precamp.state.name %in% c("California", "Washington", "Oregon") & full$treatmentstatus %in% c("self.fam", "self.only") & full$camp %in% c("Rivers, AZ", "Amache, CO", "Heart Mountain, WY", "Denson, AR", "Manzanar, CA", "Hunt, ID", "Poston, AZ", "McGehee, AR","Topaz, UT", "Newell, CA"),], aes(x = camp)) + 
        geom_bar(col = "cornflowerblue", fill = "cornflowerblue") + 
        facet_wrap(~precamp.state.name) + xlab("") + ylab("Internees") +
        scale_x_discrete(limits = c("Rivers, AZ", "Amache, CO", "Heart Mountain, WY", "Denson, AR", "Manzanar, CA", "Hunt, ID", "Poston, AZ", "McGehee, AR","Topaz, UT", "Newell, CA"), labels = c("Gila River", "Granada", "Heart Mountain", "Jerome", "Manzanar", "Minidoka", "Poston", "Rohwer", "Topaz", "Tule Lake")) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), text = element_text(size = 20),
                plot.title = element_text(hjust = 0.5))
      dev.off()
      
# Comparison of Assembly Center and Internment Camp Assignment for LA Internees
  # Merge Assembly Center codes into WRA Data    
    wra <- merge(wra, wra.assembly, by = "assembly", all.x = T)
    levels(wra$camp.name)[levels(wra$camp.name)=="Heart Mountain"] <- "Heart Mntn."
  
  # Export Figure
    pdf("ms_fig3a.pdf")
      ggplot(data = wra[wra$address == "13015",], aes(x = assembly.center.label)) + 
      geom_bar(col = "cornflowerblue", fill = "cornflowerblue") + xlab("") + ylab("Internees") + ylim(0, 10000) +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), text = element_text(size = 20),
            plot.title = element_text(hjust = 0.5))
    dev.off()
    
    pdf("ms_fig3b.pdf")
      ggplot(data = wra[wra$address == "13015",], aes(x = camp.name)) + ylim(0, 10000)+
      geom_bar(col = "cornflowerblue", fill = "cornflowerblue") + xlab("") + ylab("Internees") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), text = element_text(size = 20),
            plot.title = element_text(hjust = 0.5))
    dev.off()    
    
################################################################################################################################################################################################################################################
# JARP Data Overview
################################################################################################################################################################################################################################################
# Overview information about JARP Data
  out <- data.table(full)
  out <- out[ , list(Observations = .N, Age = round(mean(as.numeric(age), na.rm = T), 0), Gender = round(sum(gender == "Male", na.rm = T) / length(gender), 2),
                     Married = round(sum(marstat == "Married", na.rm = T) / length(marstat), 2),
                     California = sum(precamp.state.name == "California", na.rm = T),
                     Oregon = sum(precamp.state.name == "Oregon", na.rm = T),
                     Washington = sum(precamp.state.name == "Washington", na.rm = T),
                     Other = sum(!(precamp.state.name %in% c("California", "Oregon","Washington")), na.rm = T),
                     Interned = round(sum(interned.numeric == "Yes", na.rm = T) / .N, 2),
                     Families = round(sum(internedinfam > 0) / .N, 2)), by = list(generation)]
  setkey(out, generation)
  totalrow <- data.frame(generation = "Total", Observations = nrow(full), Age = round(mean(as.numeric(full$age), na.rm = T), 0), Gender = round(sum(full$gender == "Male", na.rm = T) / length(full$gender), 2),
                Married = round(sum(full$marstat == "Married", na.rm = T) / length(full$marstat), 2), California = sum(full$precamp.state.name == "California", na.rm = T),
                Oregon = sum(full$precamp.state.name == "Oregon", na.rm = T), Washington = sum(full$precamp.state.name == "Washington", na.rm = T), Other = sum(!(full$precamp.state.name %in% c("California", "Oregon","Washington")), na.rm = T),
                Interned = round(mean(full$interned.numeric == "Yes", na.rm = T), 2), Families = round(mean(full$internedinfam > 0), 2))
  
  out <- data.frame(rbind(out, totalrow))
  colnames(out)[1] <- "Generation"

# Export Table 2  
  stargazer(out, label = "tab:jarpsumm", title = "JARP Sample Demographic Information", rownames = F, colnames = T, column.sep.width = "-6pt",
            summary = F, out = "ms_table2.tex", no.space = T, digits = 2,
            table.layout ="-d-t-s=n", font.size = "tiny",
            notes = c("Notes: Age represents mean age; gender represents proportion male, and married ",
                      "represents proportion married. California, Oregon, Washington, and Other represent",
                      "counts of Japanese Americans living in each location on the eve of the internment period.",
                      "Interned represents the proportion of respondents in each generation who were interned", 
                      "Families represents the proportion respondents in a family where at least one member was interned"))
  
# Summary of Treatment Status by Cohort
  full$anyfam <- ifelse(full$internedinfam > 0, 1, 0)
  
  out <- data.table(full)
  out <- out[, list(self.fam = sum(anyfam == 1 & interned.numeric == "Yes", na.rm = T),
                    self.only = sum(anyfam == 0 & interned.numeric == "Yes", na.rm = T),
                    fam.only = sum(anyfam == 1 & interned.numeric == "No", na.rm = T),
                    control = sum(anyfam == 0 & interned.numeric == "No", na.rm = T)), by = list(generation)]
  
################################################################################################################################################################################################################################################
# Camp Features
################################################################################################################################################################################################################################################
  camps <- unique(full$camp[full$camp != "No Data" & full$camp != "Not evacuated or interned" & is.na(full$camp) == 0])
  
  # Just looked counties up manually for these in order. Some places are in multiple counties, so the county listed is the one that contains most of the camp
  # Key to major camps: 
  # Gila River = Rivers, AZ in Pinal County, AZ
  # Granada = Amache, CO in Prowers County, CO
  # Heart Mountain = Heart Mountain, Wyoming in Park County
  # Jerome = Denson, AR mostly in Drew but partly in Chicot county
  # Manzanar = Manzanar, CA in Inyo County
  # Minidoka = Hunt, ID in Jerome County, Idaho
  # Poston = Poston, AZ in Yuma County
  # Rohwer = McGehee, AR in Desha County Arkansas
  # Topaz = Topaz, UT in Utah
  # Newell, CA is Tule Lake, which is mostly Modoc but partly in Siskiyou county
  # Camp Livingston, LA is on the Rapides Parish and Grant Parish Lines
  
  # Merge in County Names for Various Camps
  camps <- data.frame(camps = camps, V1 = cbind(splitit(camps, ",", 1)))
  camps$counties <- c("Burleigh", "Zavala", "Missoula", "Santa Fe", "Hidalgo", "Comanche", "Rapides", "Millard", "Yuma", "Pinal", "Prowers", "Drew", "Inyo", "Jerome", "Park", "Desha", "Modoc")
  camps$stateabb <- splitit(camps$camps, ", ", 2)
  camps <- merge(camps, data.frame(cbind(state.abb, state.name)), by.x = "stateabb", by.y = "state.abb")
  camps <- camps[, c("camps", "counties", "state.name")]
  colnames(camps) <- c("camp", "county", "state")
  camps$county <- toupper(camps$county) ; camps$state <- toupper(camps$state)
  
  # Identify Additional Camp Treatments
  # Watch Towers, Military Police Buildings, Strikes (see Confinement and Ethnicity book in References)
  # towers = watch towers in camp; bldgs = military buildings on premises; strikes = strikes or work stoppages;
  # force = camp personnel use force or violence against internees; violence = violence among internees or between internees and civilian population;
  # peakpop = peak population (Table 3.2 in Confinement and Ethnicity)
  # Other sources: http://newmexicohistory.org/places/lordsburg-internment-pow-camp (Lordsburg), http://encyclopedia.densho.org/Fort_Sill_(detention_facility)/
  # 
  severity <- data.frame(camp = c("Rivers, AZ", "Amache, CO", "Heart Mountain, WY", "Denson, AR", "Manzanar, CA", "Hunt, ID",
                                  "Poston, AZ", "McGehee, AR", "Topaz, UT", "Newell, CA", "Fort Sill, OK", "Lordsburg, NM", "Bismarck, ND",
                                  "Crystal City, TX", "Livingston, LA"),
                         campname = c("Gila River", "Amache", "Heart Mountain", "Jerome", "Manzanar", "Minidoka",
                                      "Poston", "Rohwer", "Topaz", "Tule Lake", "Fort Sill", "Lordsburg", "Fort Lincoln",
                                      "Crystal City", "Livingston"),
                         towers = c(1, 6, 9, 7, 8, 8, 0, 8, 7, 19, NA, NA, NA, NA, NA),
                         bldgs = c(15, 15, 19, 12, 13, 15, 12, 12, NA, 63, NA, NA, NA, NA, NA),
                         strikes = c(0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1),
                         force = c(0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0),
                         violence = c(0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
                         peakpop = c(13348, 7318, 10767, 8497, 10046, 9397, 17814, 8475, 8130, 18789, 707, 2500, 1518, 4000, 1123))
  
  severity$militarism <- (severity$towers) / (severity$peakpop / 1000)
  
  severity$stateview <- ifelse(severity$strikes == 1 | severity$force == 1, 1, 0)
  
  severity$any <- ifelse(severity$strikes == 1 | severity$force == 1 | severity$strikes == 1, 1, 0)
  
  camps <- merge(camps, severity, by = "camp")
  
  camps$births <- c(415, 153, 239, NA, 550, 489, NA, NA, 541, 418, 1490, 793, 662, 384)
  camps$deaths <- c(106, 17, 76, NA, 182, 193, NA, NA, 146, 168, 331, 300, 221, 139)
  
  camps.out <- camps[, c("campname", "state","peakpop", "births", "deaths","towers", "bldgs","strikes", "force", "violence")]
  colnames(camps.out) <- c("Camp", "State", "Peak Pop.", "Births", "Deaths","Guard Towers", "Military Bldgs.", "Demonstrations", "Use of Force", "Violence")
  camps.out$State <- paste(substr(camps.out$State, 1, 1), tolower(substr(camps.out$State, 2, nchar(camps.out$State))), sep = "")
  camps.out$State[camps.out$State == "New mexico"] <- "New Mexico"
  camps.out <- camps.out[camps.out$Camp %in% c("Amache", "Jerome", "Heart Mountain", "Minidoka", "Manzanar", "Rohwer", "Tule Lake", "Poston", "Gila River", "Topaz"), c("Camp", "State", "Peak Pop.", "Guard Towers", "Demonstrations", "Use of Force", "Violence")]
  
# Export Table 1  
  stargazer(camps.out, label = "tab:camps", title = "Internment Camp Characteristics", rownames = F, colnames = T, column.sep.width = "-6.5pt",
            summary = F, out = "ms_table1.tex", no.space = T, font.size = "tiny",
            notes = c("Sources: Ishizuka 2016, Burton, et. al. 2002, Densho Encyclopedia",
                      "(see \\url{http://encyclopedia.densho.org/Fort_Sill_(detention_facility)/}),",
                      "New Mexico Office of the State Historian, ",
                      "(see \\url{http://newmexicohistory.org/places/lordsburg-internment-pow-camp})",
                      "Note: This represents a summary of the 10 major Japanese-American internment camps. Smaller detention facilities",
                      "were in active operation throughout the United States during WWII and we include these in our analysis in order to",
                      "preserve power and leverage additional information about internment length and location. These 10 camps account for",
                      "2,545 of the 2,777 interned Japanese Americans in the JARP."))

    
################################################################################################################################################################################################################################################
# Japanese Americans in and Outside of the Exclusion Zone: Comparison
################################################################################################################################################################################################################################################
# Source: https://www2.census.gov/library/publications/decennial/1940/population-nonwhite/population-nonwhite.pdf 
# Characteristics of the nonwhite population by race for 1940 Census
  
# SES by State

  # Start with age (table 33)  
    ca <- c(93717, 6263, 7041, 10583, 14623, 12200, 6563, 3912, 5456, 5838, 4672, 5906, 4794, 3671, 2007, 188)  
    co <- c(2734, 189, 254, 418, 557, 343, 80, 33, 70, 136, 129, 176, 156, 120, 66, 7)
    ny <- c(2538, 160, 156, 119, 132, 129, 234, 229, 276, 298, 277, 240, 130, 90, 58, 10)
    or <- c(4071, 190, 268, 504, 717, 537, 209, 101, 202, 253, 254, 310, 260, 177, 84, 5)
    ut <- c(2210, 117, 181, 344, 438, 225, 72, 28, 91, 129, 117, 177, 114, 123, 48, 6)
    wa <- c(14565, 705, 975, 1682, 2675, 1831, 877, 450, 686, 939, 892, 1148, 827, 583, 269, 26)

  # Combine States into single df
    comp.ja <- data.frame(rbind(ca, co, ny, or, ut, wa))
    colnames(comp.ja) <- c("all", "under5", "5to9", "10to14", "15to19", "20to24", "25to29", "30to34", "35to39", "40to44", "45to49", "50to54", "55to59", "60to64", "65to74", "over75")
    comp.ja$state <- rownames(comp.ja)
    comp.ja$excl <- c(1, 0, 0, 1, 0, 1)
    
  # Add Gender  
    comp.ja$male <- c(52550, 1505, 1781, 2271, 1263, 8093)
    comp.ja$malepct <- round(comp.ja$male / comp.ja$all * 100, 2)
    
  # Add proportion married  
    comp.ja$married <- c(18351+16450, 413+378, 721+348, 817+740, 370+327, 2804+2516)
    comp.ja$propmarried <- round(comp.ja$married / rowSums(comp.ja[,5:16]) * 100, 2)
    
  # Add share of population that might be too young to vote/care
    comp.ja$novotes <- round(rowSums(comp.ja[, 2:4]) / comp.ja[,1] * 100, 2)
    
  # Add median school years
    comp.ja$edyrs <- c(8.6, 8.1, 12.3, 8.5, 8.1, 9.0)
    comp.ja$hs <- c(9817, 155, 400, 417, 159, 1908)
  
    # Get proportion of adults 25+ who finished hs
      comp.ja$hscomp <- round(comp.ja$hs / rowSums(comp.ja[ , 7:16]) * 100, 2)
      
  # Add Labor Force
    comp.ja$emp <- c(40415, 879, 1356, 1772, 786, 6548)
    comp.ja$fplus <- c(72488, 1972, 2121, 3227, 1647, 11618)
    
    comp.ja$pctemp <- round(comp.ja$emp / comp.ja$fplus * 100, 2)
    
    comp.ja$profwork <- c(960, 25, 90, 45, 16, 152)
    comp.ja$farmers <- c(5797, 597, 8, 849, 204,846)
    
    comp.ja$pctprof <- round(comp.ja$profwork / comp.ja$emp * 100, 2)
    comp.ja$pctfarmers <- round(comp.ja$farmers / comp.ja$emp * 100, 2)
    
    comp.ja$state <- toupper(comp.ja$state)
    
    comp.ja <- comp.ja[ , c("state", "excl", "all", "malepct", "propmarried", "novotes", "hscomp", "edyrs", "pctemp", "pctprof", "pctfarmers")]
    colnames(comp.ja) <- c("State", "Exclusion Zone", "Population", "Prop. Male", "Prop. Married", "Prop. Under 20", "Pop. Completed HS", "Median Yrs. Ed.",
                           "Prop. Employed", "Prop. Professional", "Prop. Farmers")
  
    
# T- tests in and outside of exclusion zone    
    diffmeans <- data.frame(vars = c("Prop. Male", "Prop. Married", "Prop. Under 20", "Prop. Completed HS", "Median Yrs. Ed.", "Prop. Employed", "Prop. Professional", "Prop. Farmers"),
                            pvals = c(t.test(comp.ja$`Prop. Male`[comp.ja$`Exclusion Zone` == 1], comp.ja$`Prop. Male`[comp.ja$`Exclusion Zone` == 0])$p.value,
                                      t.test(comp.ja$`Prop. Married`[comp.ja$`Exclusion Zone` == 1], comp.ja$`Prop. Married`[comp.ja$`Exclusion Zone` == 0])$p.value,
                                      t.test(comp.ja$`Prop. Under 20`[comp.ja$`Exclusion Zone` == 1], comp.ja$`Prop. Under 20`[comp.ja$`Exclusion Zone` == 0])$p.value,
                                      t.test(comp.ja$`Pop. Completed HS`[comp.ja$`Exclusion Zone` == 1], comp.ja$`Prop. Under 20`[comp.ja$`Exclusion Zone` == 0])$p.value,
                                      t.test(comp.ja$`Median Yrs. Ed.`[comp.ja$`Exclusion Zone` == 1], comp.ja$`Median Yrs. Ed.`[comp.ja$`Exclusion Zone` == 0])$p.value,
                                      t.test(comp.ja$`Prop. Employed`[comp.ja$`Exclusion Zone` == 1], comp.ja$`Prop. Employed`[comp.ja$`Exclusion Zone` == 0])$p.value,
                                      t.test(comp.ja$`Prop. Professional`[comp.ja$`Exclusion Zone` == 1],comp.ja$`Prop. Professional`[comp.ja$`Exclusion Zone` == 0])$p.value,
                                      t.test(comp.ja$`Prop. Farmers`[comp.ja$`Exclusion Zone` == 1],comp.ja$`Prop. Farmers`[comp.ja$`Exclusion Zone` == 0])$p.value))
    
    colnames(diffmeans) <- c("Variable", "p Value")
    
    
      stargazer(diffmeans, label = "tab:japopcensus3", title = "p Values from Two-Sided Difference-in-Means Tests on Exclusion Zone Boundary in 1940", rownames = F, colnames = T, column.sep.width = "-6pt", digits = 2,
                summary = F, out = "si_table15.tex", no.space = T, font.size = "scriptsize")
      
    
################################################################################################################################################################################################################################################
# Balance Across Camps in WRA Data
################################################################################################################################################################################################################################################
    wra$age <- wra$birthyear
    wra$age[wra$age == "7"] <- "07"
    wra$age[wra$age == "9"] <- "09"
    wra$age <- ifelse(as.numeric(wra$birthyear) > 42, paste("18", wra$age, sep=""), paste("19", wra$age, sep = ""))
    wra$age <- 1942 - as.numeric(wra$age)
    
    wrabal <- wra %>% 
      group_by(camp.name) %>%
      summarise(count = n(),
                gendermale = round(mean(ifelse(gender %in% c("4", "2", "5", "1", "0", "3"), 1, 0)), 2),
                married = round(mean(ifelse(gender %in% c("7", "2"), 1, 0)), 2),
                usborn = round(mean(ifelse(!(birthplace %in% c("95", "94", "9&", "9-", "92", "90", "99", "93", "91", "98", "96", "97", "82", "", "83", "80", "84", "89", "--", "&&")), 1, 0)), 2),
                age = round(mean(age), 2),
                degree = round(mean(ifelse(degree != "0", 1, 0)), 2),
                hs = round(mean(ifelse(grade %in% c("D", "I", "E", "-"), 1, 0)), 2),
                lesshs = round(mean(ifelse(grade %in% c("0", "S", "K", "T", "L", "U", "M", "V", "N", "W", "O", "X", "P", "Y", "Q", "Z", "R", "A", "F", "B", "G", "C", "H", "J"), 1, 0)), 2),
                farmjob = round(mean(ifelse(qualified_job_1 %in% c("3-1", "3-0", "3-3", "3-4", "3-2", "307", "301", "302", "012", "303", "304", "336", "317", "312", "313", "314", "315", "316", "311", "318", "319", "337", "335", "3X1", "305",
                                                             "306", "308", "309"), 1, 0)), 2)) %>%
      filter(is.na(camp.name)== 0)
    
    colnames(wrabal) <- c("Camp", "Internees", "% Male", "% Married", "U.S. Born", "Age", "Degree", "Just HS", "Less than HS", "Agricultural Worker")
    
    stargazer(wrabal, label = "tab:wrbal", title = "Balance Statistics by Camp for Internees in the WRA Data", rownames = F, colnames = T, column.sep.width = "-6pt", digits = 2,
              summary = F, out = "si_table5.tex", no.space = T, font.size = "scriptsize")
    
################################################################################################################################################################################################################################################
# Representativeness for Interned Population
################################################################################################################################################################################################################################################
# Census Pop by Age Benchmarks (so I can disambiguate the 30-50 2 dig birthyears in the WRA) - this is Japanese Americans by age
  cenpop <- read.csv("census1940.csv", stringsAsFactors = F, header = T)

# Estimate Generations for People in WRA. This isn't provided in the data, but we have parents' birthplaces so we can use the definitions of each generation to get estimates of who belongs where
# Get Issei (people born in Japan who emigrated to US)
  sum(wra$birthplace %in% c("95", "94", "9&", "9-", "92", "90", "99", "93", "91", "98", "96", "97"))  / nrow(wra)

  wra$generation <- rep(NA, nrow(wra))

  wra$generation[wra$birthplace %in% c("95", "94", "9&", "9-", "92", "90", "99", "93", "91", "98", "96", "97")] <- "Issei"

# Get Nisei: People born in any non-Japanese country to at least one Issei parent
  sum(!(wra$birthplace %in% c("95", "94", "9&", "9-", "92", "90", "99", "93", "91", "98", "96", "97")) & wra$birthcountry %in% c("3", "7", "1", "C", "4", "B", "D", "2", "A"))   / nrow(wra)

  wra$generation[!(wra$birthplace %in% c("95", "94", "9&", "9-", "92", "90", "99", "93", "91", "98", "96", "97")) & wra$birthcountry %in% c("3", "7", "1", "C", "4", "B", "D", "2", "A")] <- "Nisei"

# Get Sansei: people born to at least one Nisei parent  
  sum(!(wra$birthplace %in% c("95", "94", "9&", "9-", "92", "90", "99", "93", "91", "98", "96", "97")) & wra$birthcountry %in% c("9", "U", "6", "V", "Z", "M", "8", "L", "5"))  / nrow(wra)
  
  wra$generation[!(wra$birthplace %in% c("95", "94", "9&", "9-", "92", "90", "99", "93", "91", "98", "96", "97")) & wra$birthcountry %in% c("9", "U", "6", "V", "Z", "M", "8", "L", "5")] <- "Sansei"

# Look at Age Distribution
# Format Birth Years in WRA
  table(wra$birthyear)
  wra$birthyear[wra$birthyear == "7"] <- "07"; wra$birthyear[wra$birthyear == "9"] <- "09" 
  wra$birthyear4 <- ifelse(as.numeric(wra$birthyear) < 43, paste("19", wra$birthyear, sep = ""), ifelse(as.numeric(wra$birthyear > 42), paste("18", wra$birthyear, sep = ""), NA)) 

# Generate Birth Years in JARP
  full$birthyr <- ifelse(full$generation == "Issei", 1962 - full$age, ifelse(full$generation %in% c("Nisei", "Sansei"), 1967 - full$age, NA))

# Compare Distributions: just have to condition on internment in JARP since that's the only peopel that could be in the WRA
  data <- data.frame(rbind(cbind(wra$birthyear4, rep("WRA", length(wra$birthyear4))),
                           cbind(full$birthyr[full$treatmentstatus %in% c("self.fam", "self.only")], rep("JARP", length(full$birthyr[full$treatmentstatus %in% c("self.fam", "self.only")])))))
  colnames(data) <- c("birthyr", "Data")
  data$birthyr <- as.numeric(data$birthyr)

  pdf("si_fig10.pdf")
    ggplot(data = data, aes(x = birthyr, fill = Data)) + geom_density(alpha = 0.25) + xlab("Birth Year") + ylab("Density")
  dev.off()

  t.test(data$birthyr[data$Data == "WRA"], data$birthyr[data$Data == "JARP"])
  ks.test(x = data$birthyr[data$Data == "WRA"], data$birthyr[data$Data == "JARP"])

# Gender - looks good
  t.test(full$gender[full$treatmentstatus %in% c("self.only", "self.fam")] == 0, wra$gender %in% c("4", "2", "5", "1", "0", "3"))
  prop.test(x = c(sum(full$gender[full$treatmentstatus %in% c("self.only", "self.fam")] == 0, na.rm = T), sum(wra$gender %in% c("4", "2", "5", "1", "0", "3"))),
            n = c(length(full$gender[is.na(full$gender) == 0 & full$treatmentstatus %in% c("self.only", "self.fam")]), length(wra$gender)))

# Education
# JARP
  table(full$n.education)
  table(full$s.education)
  table(full$i.edjpn)
  
  full$i.edjpn.n <- full$i.edjpn
  full$i.edjpn.n[which(full$i.edjpn.n == "None")] <- 0
  full$i.edjpn.n <- as.numeric(full$i.edjpn.n)

  full$education <- rep(NA, nrow(full))
  full$education[which(full$i.edjpn.n < 1)] <- "None"
  full$education[which(full$i.edjpn.n > 0 & full$i.edjpn.n < 12)] <- "Less than High School"
  full$education[which(full$i.edjpn.n == 12)] <- "High School Graduate"
  full$education[which(full$i.edjpn.n > 12 & full$i.edjpn.n < 16)] <- "Some College"
  full$education[which(full$i.edjpn.n == 16)] <- "College Graduate"
  full$education[which(full$i.edjpn.n > 16)] <- "Graduate Work"
  
  full$education[which(full$n.education == "Inapplicable: never attended school")] <- "None"
  full$education[which(full$n.education %in% c("1-4 grades/years", "5-7 grades/years", "8 grades/years", "9-11 grades/years"))] <- "Less than High School"
  full$education[which(full$n.education %in% c("12 years (high school graduate)"))] <- "High School Graduate"
  full$education[which(full$n.education %in% c("13-15 years (some college)"))] <- "Some College"
  full$education[which(full$n.education %in% c("16 years (completed college)"))] <- "College Graduate"
  full$education[which(full$n.education %in% c("More than 16 years years; some graduate work"))] <- "Graduate Work"

  full$education[which(full$s.education == "Less than high school")] <- "Less than High School"
  full$education[which(full$s.education == "12 years (high school grad)")] <- "High School Graduate"
  full$education[which(full$s.education == "13-15 years (some college)")] <- "Some College"
  full$education[which(full$s.education == "16 years (completed college)")] <- "College Graduate"
  full$education[which(full$s.education == "More than 16 years (some graduate work)")] <- "Graduate Work"

# WRA    
  wra$education <- rep(NA, nrow(wra))
  wra$education[which(wra$grade == "J")] <- "None"
  wra$education[which(wra$grade %in% c("0", "S", "K", "T", "L", "U", "M", "V", "N", "W", "O", "X", "P", "Y", "Q", "Z", "R", "A", "F", "B", "G", "C", "H"))] <- "Less than High School"
  wra$education[which(wra$grade %in% c("D", "I", "E", "-"))] <- "High School Graduate"
  wra$education[which(wra$grade %in% c("1", "5", "2", "6", "3", "7"))] <- "Some College"
  wra$education[which(wra$grade %in% c("4", "8"))] <- "College Graduate"
  wra$education[which(wra$grade %in% c("9"))] <- "Graduate Work"

  fulled <- full[full$treatmentstatus %in% c("self.only", "self.fam") & full$generation == "Issei",] %>% 
    group_by(education) %>% 
    summarize(x = n()) %>% 
    filter(!is.na(education)) %>% 
    mutate(pct = x / sum(x),
           Data = "JARP")

  wraed <- wra[wra$generation == "Issei",] %>% 
    group_by(education) %>% 
    summarize(x = n()) %>% 
    filter(!is.na(education)) %>% 
    mutate(pct = x / sum(x),
           Data = "WRA")

  education <- data.frame(rbind(fulled, wraed))
  education$education <- factor(education$education, levels = c("None", "Less than High School", "High School Graduate", "Some College", "College Graduate", "Graduate Work"), ordered = T)
  education$education <- revalue(education$education, c("None" = "None", "Less than High School" = "Less than \n High School", "High School Graduate" = "High School \n Graduate",
                                                        "Some College" = "Some \n College", "College Graduate" = "College \n Graduate", "Graduate Work" = "Graduate \n Work"))

  pdf("si_fig11.pdf")
  ggplot(education, aes(x = education, y = pct, fill = Data, color = Data)) + geom_bar(stat = "identity", position = position_dodge()) + xlab("") + ylab("Percentage of Issei") +
    scale_fill_brewer(palette = "Blues") + scale_color_manual(values = rep("black", 2))
  dev.off()
